Here we perform some basic EDA operations.
import os
import pandas as pd
import geopandas as gpd
import numpy as np
import feather
from pandas_profiling import ProfileReport
import plotly.express as px
import matplotlib.pyplot as plt
import seaborn as sn
import psutil
from translate import Translator
# move from the notebook folder into the project's data directory
# NOTE(review): relative chdir — rerunning this cell from a different
# working directory will fail or land in the wrong folder; verify
os.chdir('..')
os.chdir('data')
os.getcwd()
# afgifte records 2019-2020 with classification codes already matched
df = feather.read_dataframe('lma/af_2019-2020_matched-codes.feather')
For our research, we are looking at the location of 'eerste afnemers', the receivers of secondary resources. However, there is a problem - some locations of 'eerste afnemers' actually represent the location of the offices of the reusing company, rather than the actual location of reuse. For example, for the process of using rubble as a foundation for roads, the eerste afnemers' locations are the locations of the contractors' offices, not the location of the roads that are being filled.
Because of this, we need to talk to experts from LMA to determine which flows are less likely to have accurate 'eerste afnemers' locations (such as rubble), and which could be more accurate (metal, plastic). The purpose of this section is therefore to prepare a list of major flow types from the LMA dataset, which we can use to discuss with LMA experts. The types will be categorized by VMC (processing type), SBI (industry type) and EWC/GNC (material type).
# clean df: reload the raw data and keep only the columns used here
df = feather.read_dataframe('lma/af_2019-2020_matched-codes.feather')
df = df[['eaNaam', 'gnc', 'ewc', 'vmc', 'sbi', 'kg']]
# normalize the string 'nan' in sbi to '0' first so that the next replace
# maps it (together with None, 'nan' and literal '0') to the '--' placeholder
df.sbi = df.sbi.replace('nan', '0')
df = df.replace([None, 'nan', '0'], '--')
# change codes to chapters: keep only the leading characters of each code
df.gnc = df.gnc.str[:2]
df.ewc = df.ewc.str[:2]
df.vmc = df.vmc.str[:1]
# sbi chapters: sbi holds comma-separated codes; split into a list per row
df.sbi = df.sbi.str.split(',')
def chap(sbis):
    """Reduce each sbi code in *sbis* to its 2-character chapter prefix,
    returning the deduplicated chapters as a list (order unspecified)."""
    chapters = {code[:2] for code in sbis}
    return list(chapters)
# reduce every sbi code of a company to its chapter, deduplicated
df.sbi = df.sbi.map(lambda x: chap(x))
# number of distinct sbi chapters per company
df['sbiLen'] = df.sbi.map(lambda x: len(x))
# back to a single comma-joined string so it can be grouped on
df.sbi = df.sbi.map(lambda x: ','.join(x))
There are lots of different types of SBI chapters because many companies have multiple sbi codes, creating many different combinations
# flow types: total kg per unique (gnc, ewc, vmc, sbi) combination, heaviest first
ft = df.groupby(['gnc', 'ewc', 'vmc', 'sbi']).sum().reset_index().sort_values('kg', ascending=False)
# # add chapter text (disabled; human-readable descriptions are merged in
# # further down in this notebook instead)
# ewcC = feather.read_dataframe('classification/ewc.feather')
# ft = pd.merge(ft, ewcC, how='left', on='ewc')
# vmcC = feather.read_dataframe('classification/vmc.feather')
# ft = pd.merge(ft, vmcC, how='left', on='vmc')
# gncC = feather.read_dataframe('classification/gnc.feather')
# gncC = gncC[['Code', 'descDetail']]
# gncC.rename(columns={'Code': 'gnc', 'descDetail': 'gncDesc'}, inplace=True)
# ft = pd.merge(ft, gncC, how='left', on='gnc')
# one readable label per flow type, used as the pie-chart legend below
ft['type'] = 'gnc:' + ft.gnc + ' ewc:' + ft.ewc + ' vmc:' + ft.vmc + ' sbi:' + ft.sbi
ft.head()
# left: top-20 flow types by weight; right: all flow types (shows fragmentation)
fig, ax = plt.subplots(1,2,figsize=(9*2,5))
ax[0].pie(ft.kg.head(20))
ax[0].legend(ft.type.head(20), loc='center left', bbox_to_anchor=(1,0.5))
ax[0].set_title('top 20 flow types from afgifte dataset')
ax[1].pie(ft.kg)
ax[1].set_title('all flow types ({} in total)'.format(len(ft.kg)))
plt.show()
Here, we tried to categorize the afgifte flows using the gnc, ewc, vmc, and sbi codes. As seen above, there are 1864 flow types, which is a lot. There are so many flow types because of the SBI codes — many companies have multiple SBI codes, as many as thirty!
# distribution of the number of sbi chapters per record
print('how many sbi codes do companies have in the dataset?')
for i in sorted(df.sbiLen.unique()):
    # NOTE(review): this counts records with i chapters but the message says
    # 'companies' — confirm whether rows map 1:1 to companies here
    numCom = len(df[df.sbiLen == i])
    print('{} companies have {} sbi codes'.format(numCom, i))
# total kg handled per sbi-chapter count
sbiLen = df.groupby('sbiLen').sum().sort_values('kg', ascending=False).reset_index()
fig, ax = plt.subplots(1,1,figsize=(9*2,5))
ax.pie(sbiLen.kg)
ax.legend(sbiLen.sbiLen, loc='center left', bbox_to_anchor=(1,0.5))
ax.set_title('% waste (in kg) associated with x number of sbis')
plt.show()
# inspect the record(s) with the maximum of 30 sbi chapters
df[df.sbiLen == 30]
# 30 associated sbi codes of gemeente amsterdam
# make a DataFrame of the 30 sbi chapter codes of the first such record
ams = df[df.sbiLen == 30].sbi.iloc[0]
ams = ams.split(',')
ams = pd.DataFrame(ams, columns=['sbi'])
# merge with sbi chapter headings to get a description per code
sbiChap = feather.read_dataframe('classification/sbi_Headings.feather')
ams = pd.merge(ams, sbiChap[['sbi', 'sbiDesc']], how='left', on='sbi')
# display the first 10 codes with their descriptions
ams.head(10)
For companies with multiple SBI codes, we replace their sbi code with the placeholder '--'.
# Work on a copy: the original `dfOneSbi = df` aliased df, so the .loc
# assignment below silently overwrote df.sbi for multi-sbi rows as well.
# A copy keeps df intact for any later reuse.
dfOneSbi = df.copy()
# companies with more than one sbi chapter get the '--' placeholder
dfOneSbi.loc[dfOneSbi.sbiLen > 1, 'sbi'] = '--'
# total kg per (gnc, ewc, vmc, sbi) flow type
dfOneSbi = dfOneSbi.groupby(['gnc', 'ewc', 'vmc', 'sbi']).sum().kg.reset_index()
# readable label per flow type, used as the legend below
dfOneSbi['type'] = 'gnc:' + dfOneSbi.gnc + ' ewc:' + dfOneSbi.ewc + ' vmc:' + dfOneSbi.vmc + ' sbi:' + dfOneSbi.sbi
dfOneSbi = dfOneSbi.sort_values('kg', ascending=False)
num = len(dfOneSbi.sbi.unique())
print('number of unique sbi chapters: {}'.format(num))
# left: top-20 flow types by weight; right: all of them
fig, ax = plt.subplots(1,2,figsize=(9*2,5))
ax[0].pie(dfOneSbi.kg.head(20))
ax[0].legend(dfOneSbi.type.head(20), loc='center left', bbox_to_anchor=(1,0.5))
ax[0].set_title('top 20 flow types from afgifte dataset')
ax[1].pie(dfOneSbi.kg)
ax[1].set_title('all flow types ({} in total)'.format(len(dfOneSbi.kg)))
plt.show()
Now let's try to use sbi section headers to categorize the flows.
# read sbiChap, which includes chapter headings and which section each chapter belongs to.
sbiChap = feather.read_dataframe('classification/sbi_Chapters.feather')
sbiChap = sbiChap[['section', 'sbi', 'sbiDesc']]
# make df with sbi section heads
dfSbiSec = dfOneSbi.copy()
dfSbiSec.reset_index(inplace=True, drop=True)
# merge output is in dfSbiSec row order; the fresh 0..n index (reset above)
# makes the positional .section assignment line up with dfSbiSec's rows
dfSbiSec['sbiSec'] = pd.merge(dfSbiSec.sbi, sbiChap, how='left', on='sbi').section
# fix: plain assignment instead of `dfSbiSec.sbiSec.fillna(..., inplace=True)`;
# inplace on an attribute-accessed column is chained assignment, which warns
# and is a silent no-op under pandas copy-on-write (pandas >= 2.x)
dfSbiSec['sbiSec'] = dfSbiSec.sbiSec.fillna('--')
# groupby: total kg per (gnc, ewc, vmc, sbi section) flow type
dfSbiSec = dfSbiSec.groupby(['gnc', 'ewc', 'vmc', 'sbiSec']).sum().reset_index().sort_values('kg', ascending=False)
# readable label per flow type for the legend
dfSbiSec['type'] = 'gnc:' + dfSbiSec.gnc + ' ewc:' + dfSbiSec.ewc + ' vmc:' + dfSbiSec.vmc + ' sbi:' + dfSbiSec.sbiSec
print('number of unique sbi sections: {}'.format(len(dfSbiSec.sbiSec.unique())))
# left: top-20 flow types by weight; right: all of them
fig, ax = plt.subplots(1,2,figsize=(9*2,5))
ax[0].pie(dfSbiSec.kg.head(20))
ax[0].legend(dfSbiSec.type.head(20), loc='center left', bbox_to_anchor=(1,0.5))
ax[0].set_title('top 20 flow types from afgifte dataset')
ax[1].pie(dfSbiSec.kg)
ax[1].set_title('all flow types ({} in total)'.format(len(dfSbiSec.kg)))
plt.show()
# make copy for new df (gnc sections instead of gnc chapters)
dfGncSec = dfSbiSec.copy()
dfGncSec.drop(labels=['type'], axis=1, inplace=True)
dfGncSec.reset_index(inplace=True, drop=True)
# add gnc section headers; merge output is positionally aligned with the
# freshly reset 0..n index
gncSec = feather.read_dataframe('classification/gnc_Headings.feather')
dfGncSec['gncSec'] = pd.merge(dfGncSec.gnc, gncSec[['gnc', 'section']], how='left', on='gnc').section
dfGncSec = dfGncSec[['gnc','gncSec', 'ewc', 'vmc', 'sbiSec', 'kg']]
# fix: plain assignment instead of `dfGncSec.gncSec.replace(..., inplace=True)`;
# inplace on an attribute-accessed column is chained assignment, which warns
# and is a silent no-op under pandas copy-on-write (pandas >= 2.x)
dfGncSec['gncSec'] = dfGncSec.gncSec.replace(np.NaN, '--')
# groupby: total kg per (gnc section, ewc, vmc, sbi section) flow type
dfGncSec = dfGncSec.groupby(['gncSec', 'ewc', 'vmc', 'sbiSec']).sum().reset_index().sort_values('kg', ascending=False)
dfGncSec['type'] = 'gnc:' + dfGncSec.gncSec + ' ewc:' + dfGncSec.ewc + ' vmc:' + dfGncSec.vmc + ' sbi:' + dfGncSec.sbiSec
# left: top-20 flow types by weight; right: all of them
fig, ax = plt.subplots(1,2,figsize=(9*2,5))
ax[0].pie(dfGncSec.kg.head(20))
ax[0].legend(dfGncSec.type.head(20), loc='center left', bbox_to_anchor=(1,0.5))
ax[0].set_title('top 20 flow types from afgifte dataset')
ax[1].pie(dfGncSec.kg)
ax[1].set_title('all flow types ({} in total)'.format(len(dfGncSec.kg)))
plt.show()
Without the sbis, there are fewer types - only 118 in total, see below.
# flow types ignoring sbi entirely: group on material and process codes only
ftNoSbi = ft.groupby(['gnc', 'ewc', 'vmc']).sum().kg.reset_index().sort_values('kg', ascending=False)
ftNoSbi['type'] = 'gnc:' + ftNoSbi.gnc + ' ewc:' + ftNoSbi.ewc + ' vmc:' + ftNoSbi.vmc
# left: top-20 flow types by weight; right: all of them
fig, ax = plt.subplots(1,2,figsize=(9*2,5))
ax[0].pie(ftNoSbi.kg.head(20))
ax[0].legend(ftNoSbi.type.head(20), loc='center left', bbox_to_anchor=(1,0.5))
ax[0].set_title('top 20 flow types from afgifte dataset')
ax[1].pie(ftNoSbi.kg)
ax[1].set_title('all flow types ({} in total)'.format(len(ftNoSbi.kg)))
plt.show()
# groupby to create pie chart (matplotlib)
# gnc: total kg per gnc chapter
gnc = ft.groupby('gnc').sum().reset_index()
gnc = gnc.sort_values('kg', ascending=False)
gncLen = len(gnc)
# lump chapters below a hand-picked weight cutoff into 'other'
gnc.loc[gnc['kg'] < 150000000, 'gnc'] = 'other'
gnc = gnc.groupby('gnc').sum().reset_index().sort_values('kg', ascending=False)
gnc = gnc[gnc.gnc != '--']
# ewc: same treatment with its own cutoff
ewc = ft.groupby('ewc').sum().reset_index()
ewc = ewc.sort_values('kg', ascending=False)
ewcLen = len(ewc)
ewc.loc[ewc['kg'] < 800000000, 'ewc'] = 'other'
ewc = ewc.groupby('ewc').sum().reset_index().sort_values('kg', ascending=False)
ewc = ewc[ewc.ewc != '--']
# vmc: no 'other' bucket applied
vmc = ft.groupby('vmc').sum().reset_index()
vmc = vmc.sort_values('kg', ascending=False)
vmcLen = len(vmc)
# sbi
sbi = ft.groupby('sbi').sum().kg.reset_index().sort_values('kg', ascending=False)
sbiLen = len(sbi)
# NOTE(review): very specific cutoff, presumably read off from the data — confirm
sbi.loc[sbi['kg'] < 161052464, 'sbi'] = 'other'
sbi = sbi.groupby('sbi').sum().reset_index().sort_values('kg', ascending=False)
# merge gnc and ewc columns to create df of material (mat for short)
gnc['mat'] = 'gnc' + gnc.gnc
ewc['mat'] = 'ewc' + ewc.ewc
mat = pd.concat([gnc, ewc])
# drop the code columns and the summed sbiLen column carried along from ft
mat.drop(labels=['gnc', 'ewc', 'sbiLen'], inplace=True, axis=1)
mat = mat[['mat', 'kg']]
mat.sort_values('kg', ascending=False, inplace=True)
# three pies: material (gnc+ewc), process (vmc), industry (sbi), all by weight
fig, ax = plt.subplots(1,3,figsize=(8*3,6))
ax[0].pie(mat.kg)
ax[0].legend(mat.mat, loc='center left', bbox_to_anchor=(1,0.5))
ax[0].set_title('gnc/ewc')
ax[1].pie(vmc.kg)
ax[1].legend(vmc.vmc, loc='center left', bbox_to_anchor=(1,0.5))
ax[1].set_title('vmc')
ax[2].pie(sbi.kg)
ax[2].legend(sbi.sbi, loc='center left', bbox_to_anchor=(1,0.5))
ax[2].set_title('sbi')
# display, with the pre-'other'-lumping counts of unique codes
print('distribution of gnc, ewc, and vmc codes in afgifte dataset by weight')
plt.show()
print('# unique gnc codes: {}'.format(gncLen))
print('# unique ewc codes: {}'.format(ewcLen))
print('# unique vmc codes: {}'.format(vmcLen))
print('# unique sbi codes: {}'.format(sbiLen))
So far, we've tried to categorize the flows in the following ways:
I will pick the third categorization method and add descriptions to it. (The fourth method wasn't chosen because using GNC sections turned out to be too vague.) With this, we can work with LMA experts to identify the flow types with locations that represent the real location of reuse. If it turns out that SBI codes don't really make a difference, then I will move on to the last categorization method, which doesn't include SBI codes.
# ADD DESCRIPTIONS
# read classification files (code -> human-readable description lookups)
gncDesc = feather.read_dataframe('classification/gnc_Headings.feather')
ewcDesc = feather.read_dataframe('classification/ewc.feather')
vmcDesc = feather.read_dataframe('classification/vmc.feather')
sbiDesc = feather.read_dataframe('classification/sbi_Headings.feather')
# make copy of dfSbiSec (dfl stands for df for lma) — the chosen categorization;
# NOTE: the original comment said dfGncSec, but the code copies dfSbiSec
dfl = dfSbiSec.copy()
# add descriptions
dfl = pd.merge(dfl, gncDesc[['gnc', 'gncDesc']], how='left', on='gnc') # gnc
dfl = pd.merge(dfl, ewcDesc, how='left', on='ewc') # ewc
dfl = pd.merge(dfl, vmcDesc, how='left', on='vmc') # vmc
# sbi sections are looked up in the headings table's sbi column
dfl = pd.merge(dfl, sbiDesc[['sbi', 'sbiDesc']], how='left', left_on='sbiSec', right_on='sbi') # sbi
# rearrange columns
dfl = dfl[['gnc', 'gncDesc', 'ewc', 'ewcDesc', 'vmc', 'vmcDesc', 'sbiSec', 'sbiDesc', 'kg']]
# remove np.NaN left behind by unmatched merges
dfl.replace(np.NaN, '--', inplace=True)
# display all flow types with descriptions
dfl.head(10)
# make different flow types: coarser aggregations of dfl
dfNoSbi = dfl.groupby(['gnc', 'gncDesc', 'ewc', 'ewcDesc', 'vmc', 'vmcDesc']).sum().sort_values('kg', ascending=False).reset_index() # flow types without sbi
dfMat = dfl.groupby(['gnc', 'gncDesc', 'ewc', 'ewcDesc']).sum().sort_values('kg', ascending=False).reset_index() # material flow types (ewc, gnc)
dfPro = dfl.groupby(['vmc', 'vmcDesc']).sum().sort_values('kg', ascending=False).reset_index() # processing flow types (vmc)
import xlsxwriter
# Write each flow-type table to its own sheet of one workbook for LMA experts.
# Use the context manager so the file is finalized even on error;
# ExcelWriter.save() was deprecated and removed in pandas 2.0 in favour of close().
with pd.ExcelWriter('lma/flowTypes.xlsx', engine='xlsxwriter') as writer:
    dfl.to_excel(writer, sheet_name='flowTypes_allCodes')
    dfNoSbi.to_excel(writer, sheet_name='flowTypes_noSbi')
    dfMat.to_excel(writer, sheet_name='flowTypes_mat')
    dfPro.to_excel(writer, sheet_name='flowTypes_pro')
There seem to be some combinations of material & processing codes (e.g. reused construction and demolition waste) that are associated with all SBI codes. This means that all industries, from financial services to construction companies, are involved in reusing C&D waste (see example below).
Not all flow types are associated with multiple SBIs, and this section will further explain this.
# example: all sbi-section variants of one flow type (same gnc/ewc/vmc as row 2)
row = dfl.iloc[2]
mask = (dfl.gnc == row.gnc) & (dfl.ewc == row.ewc) & (dfl.vmc == row.vmc)
dfl[mask]
# finding number of sbis associated for each flow type
# (flow type = material + processing type)
def numSbi(row):
    """Count how many dfl rows (one per sbi section) share this row's
    gnc/ewc/vmc combination."""
    same_flow = (dfl.gnc == row.gnc) & (dfl.ewc == row.ewc) & (dfl.vmc == row.vmc)
    return len(dfl[same_flow])
# associated-sbi count per sbi-less flow type
dfNoSbi['numSbi'] = dfNoSbi.apply(lambda row: numSbi(row), axis=1)
# number of flow types with x number of associated sbis
# NOTE(review): this rebinds the name numSbi from the function above to a
# DataFrame — the helper is no longer callable after this line
numSbi = dfNoSbi.groupby('numSbi').count().gnc.reset_index().sort_values('numSbi')
numSbi.rename(columns={'gnc': 'count'}, inplace=True)
# kg of waste with x number of associated sbis
kgSbi = dfNoSbi.groupby('numSbi').sum().sort_values('kg', ascending=False).reset_index()
# pie chart explaining number of SBIs associated with major flows
fig, ax = plt.subplots(1,2,figsize=(9*2,6))
# left: flow types with x number of associated sbis (by count)
ax[0].pie(numSbi['count'])
ax[0].legend(numSbi.numSbi, loc='center left', bbox_to_anchor=(1,0.5), title='# associated sbis')
ax[0].set_title('% of flow types with x number of associated sbis')
# right: kg of waste with x number of associated sbis (by weight)
ax[1].pie(kgSbi.kg)
ax[1].legend(kgSbi.numSbi, loc='center left', bbox_to_anchor=(1,0.5), title='# associated sbis')
ax[1].set_title('kg of waste types with x number of associated sbis')
plt.show()
def asSbi(gnc, ewc, vmc):
    """Return every dfl row (one per associated sbi section) for the
    flow type identified by the given gnc, ewc and vmc codes."""
    selector = (dfl.gnc == gnc) & (dfl.ewc == ewc) & (dfl.vmc == vmc)
    return dfl[selector]
# example flow types with varying numbers of associated sbi sections
asSbi('--', '17', 'B') # flow type with 20 SBIs: reused C&D waste
asSbi('23', '--', 'B') # flow type with 1 sbi: reused food waste
asSbi('--', '17', 'E') # flow type with 2 sbis: composted c&d waste
asSbi('--', '15', 'D') # flow type with 3 sbis: mechanically treated waste packaging
Questions:
SBI codes are actually the same as NACE codes - the first 4 digits of SBI are the same as the first 4 digits of NACE. The following section headings can be considered as 'making':
Note: I would like to do all three industry types, because this allows us to identify similarities and differences of the different industries. However, as you will see in the following sections, the three industries differ quite a bit in terms of amount of kg received, as well as spatial distribution of the waste received. You could argue that the flows are so different that they're not even comparable.
# sbi chapter headings, incl. which section each chapter belongs to
sbi = feather.read_dataframe('classification/sbi_Headings.feather')
# sections treated as 'making' industries — presumably A = agriculture,
# C = manufacturing, F = construction (standard SBI/NACE sections); confirm
secMake = ['A', 'C', 'F']
sbiMake = sbi[sbi.section.isin(secMake)]
# NOTE(review): subtracting len(secMake) assumes the headings table also
# contains one row per section heading itself — confirm
print('number of chapters included as making industry: {}'.format(len(sbiMake) - len(secMake)))
print(sbiMake.sbi.unique())
sbiMake.head()
# make dataframe of total kg associated with each sbi code
dfs = ft.groupby('sbi').sum().kg.reset_index().sort_values('kg', ascending=False)
# mark out multiple sbi codes as 'mul'
def sbiMul(x):
    """Collapse comma-joined multi-chapter sbi strings (anything longer
    than one 2-character chapter) to the marker 'mul'."""
    return 'mul' if len(x) > 2 else x
# collapse multi-sbi entries, then re-aggregate kg per sbi value
dfs.sbi = dfs.sbi.map(lambda x: sbiMul(x))
dfs = dfs.groupby('sbi').sum().reset_index().sort_values('kg', ascending=False)
# merge with section headings; codes without a heading become NaN -> '--'
dfs = pd.merge(dfs, sbi, how='left', on='sbi')
dfs.section = dfs.section.replace(np.NaN, '--')
def na(row):
    """Where the section merge found no match ('--'), fall back to the
    row's own sbi value. Mutates and returns *row*."""
    if row['section'] == '--':
        row['section'] = row['sbi']
    return row
# row-wise fallback: copy the sbi value into unmatched section cells
dfs = dfs.apply(lambda row: na(row), axis=1)
# total kg per sbi section
dfs = dfs.groupby(['section']).sum().reset_index().sort_values('kg', ascending=False)
# matched?
def match(x):
    """Label a section value: '--' means no sbi could be matched, 'mul'
    means the company had multiple sbis, anything else is a real match."""
    special = {'--': 'sbi unknown', 'mul': 'multiple sbis'}
    return special.get(x, 'matched')
# label each section row as matched / unknown / multiple
dfs['match'] = dfs.section.map(lambda x: match(x))
# making vs non-making
def making(x):
    """Classify a section: making sections (in secMake) are returned as-is,
    the '--'/'mul' placeholders get descriptive labels, everything else is
    'non-maker'."""
    if x in secMake:
        return x
    if x == '--':
        return 'sbi unknown'
    if x == 'mul':
        return 'multiple sbis'
    return 'non-maker'
# label each section row as maker section / non-maker / placeholder
dfs['maker'] = dfs.section.map(lambda x: making(x))
# display
dfs.head()
# pie chart explaining number of SBIs associated with major flows
fig, ax = plt.subplots(1,3,figsize=(8*3,6))
# explode -- and mul values
# NOTE(review): explode is computed but never passed to any pie() call below —
# presumably ax[0].pie(dfs.kg, explode=explode) was intended; also the [0, 2]
# positions assume where '--' and 'mul' land after sorting by kg — confirm
explode = np.zeros(len(dfs))
explode[[0,2]] = 0.2
# left: kg per sbi section
ax[0].pie(dfs.kg)
ax[0].legend(dfs.section, loc='center left', bbox_to_anchor=(1,0.5), title='sbi section')
ax[0].set_title('sbi code sections (by weight)')
# middle: matched vs unmatched sbis
dfss = dfs.groupby('match').sum().reset_index().sort_values('kg', ascending=False)
ax[1].pie(dfss.kg)
ax[1].legend(dfss.match, loc='center left', bbox_to_anchor=(1,0.5))
ax[1].set_title('matched vs unmatched sbi codes (by weight)')
# right: maker vs non-maker sbis
dfsm = dfs.groupby('maker').sum().reset_index().sort_values('kg', ascending=False)
# hard-coded colours; NOTE(review): their mapping to categories depends on the
# kg sort order of dfsm — confirm the greys still hit the placeholder slices
colors = ['#e8e8e8', '#adadad', '#41ab66', '#e8e8e8', '#639beb', '#e84f4f']
ax[2].pie(dfsm.kg, colors=colors)
ax[2].legend(dfsm.maker, loc='center left', bbox_to_anchor=(1,0.5))
ax[2].set_title('makers vs non-makers (by weight)')
plt.show()
As seen in the second pie chart, almost half of the flows from the dataset aren't matched with an SBI code, either because the eerste afnemer is associated with multiple sbi codes, or we were unable to match an sbi code at all. If we are depending on industry type to categorize the flows, it is essential to improve the sbi matching process. Rusne's code would be a good reference for improvement.
Again, the three relevant SBI sections for us are:
For the three making industries categorized above, what are their associated materials and processing types?
# flows received by the three 'making' sections (A, C, F).
# .copy() so that later column assignments on dfMake operate on an
# independent frame instead of a slice of dfl (avoids pandas
# SettingWithCopyWarning / silent chained-assignment behaviour)
dfMake = dfl[dfl.sbiSec.isin(['A', 'C', 'F'])].copy()
dfMake = dfMake.sort_values('kg', ascending=False)
# material column
def mat(row):
    """Build a material label for a flow row, preferring the GNC
    description and falling back to the EWC description."""
    if row.gnc == '--':
        return 'EWC- ' + row.ewcDesc
    return 'GNC- ' + row.gncDesc
# one material label per flow, truncated so the legend stays readable
dfMake['mat'] = dfMake.apply(lambda row: mat(row), axis=1)
dfMake.mat = dfMake.mat.str[:80]
# NOTE(review): rebinds dfMat (earlier a groupby of dfl used for the excel export)
dfMat = dfMake.groupby(['sbiSec', 'sbiDesc', 'mat']).sum().reset_index().sort_values('kg', ascending=False)
fig = px.bar(dfMat, x="sbiDesc", y="kg", color="mat", title="Materials received by eerste afnemers in each industry")
fig.show()
# total kg per (making industry, processing type) pair
dfp = dfMake.groupby(['sbiSec', 'sbiDesc', 'vmc', 'vmcDesc']).sum().reset_index()
# truncate long process descriptions so the legend stays readable
dfp.vmcDesc = dfp.vmcDesc.str[:80]
def vmc(row):
    """Return the row's processing description prefixed with its vmc code,
    e.g. 'B - <description>'."""
    return row.vmc + ' - ' + row.vmcDesc
# prefix descriptions with their vmc code and plot per industry
dfp.vmcDesc = dfp.apply(lambda row: vmc(row), axis=1)
dfp = dfp.sort_values('kg', ascending=False)
dfp.head()
# fixed typo in the chart title: 'wtih' -> 'with'
fig = px.bar(dfp, x="sbiDesc", y="kg", color="vmcDesc", title="Processes associated with eerste afnemers in each industry")
fig.show()
I don't really understand why 'G - you dump the waste' can be an option in the afgifte dataset - so are landfills and incinerators also included as eerste afnemers in the afgifte dataset?
Questions:
ft.head()
# reload the gnc and ewc classification tables
gncDesc = feather.read_dataframe('classification/gnc_Headings.feather')
ewcDesc = feather.read_dataframe('classification/ewc.feather')
# read gnc classification codes (level 1 rows only)
print(gncDesc[gncDesc.level == 1].shape)
gncDesc[gncDesc.level == 1]
# read ewc classification codes (2-character codes, i.e. chapter level)
print(ewcDesc[ewcDesc.ewc.str.len() == 2].shape)
ewcDesc[ewcDesc.ewc.str.len() == 2]
# create df of kg produced per material
# NOTE(review): rebinds the name mat (used earlier for a helper function
# and a gnc/ewc DataFrame)
mat = ft.groupby(['gnc', 'ewc']).sum().kg.reset_index()
mat.head()
# what codes are in the data?
print('ewc codes: {}\n'.format(mat.ewc.unique()))
print('gnc codes: {}'.format(mat.gnc.unique()))
As seen above, something is wrong with some of the codes: the gnc values '00' and '99' do not correspond to valid GNC chapters, so we check how much weight they account for below.
# flows with gnc '00' or '99' — presumably placeholder/invalid chapter codes; confirm
gncBad = mat[mat.gnc.isin(['00', '99'])][['gnc', 'kg']]
matSum = mat.kg.sum()
# share of total weight per bad code, in percent (2 decimals)
gncBad['percKg'] = gncBad.kg.map(lambda x: round(x/matSum*100, 2))
print('% of bad gnc codes by weight: {}'.format(gncBad.percKg.sum()))
gncBad